1   /*
2    * Copyright (c) 2008, Oracle and/or its affiliates. All rights reserved.
3    * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4    *
5    * This code is free software; you can redistribute it and/or modify it
6    * under the terms of the GNU General Public License version 2 only, as
7    * published by the Free Software Foundation.
8    *
9    * This code is distributed in the hope that it will be useful, but WITHOUT
10   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11   * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12   * version 2 for more details (a copy is included in the LICENSE file that
13   * accompanied this code).
14   *
15   * You should have received a copy of the GNU General Public License version
16   * 2 along with this work; if not, write to the Free Software Foundation,
17   * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18   *
19   * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20   * or visit www.oracle.com if you need additional information or have any
21   * questions.
22   */
23  
24  /*
25   * @test
26   * @bug 4486841 7040220
27   * @summary Test UTF-8 charset
28   */
29  
30  import java.nio.charset.*;
31  import java.nio.*;
32  import java.util.*;
33  
/**
 * Exercises the UTF-8 charset through three code paths: the
 * {@link CharsetEncoder}/{@link CharsetDecoder} API on heap buffers, the same
 * API on direct buffers, and the {@code String} constructor/{@code getBytes}
 * conversions.
 *
 * <p>Every {@code check*} method prints its progress to {@code System.out} and
 * throws a {@link RuntimeException} when any comparison fails, so that a
 * mismatch fails the test instead of only being logged.
 */
public class TestUTF8 {

    // ---------------------------------------------------------------------
    // Buffer helpers shared by the encode/decode entry points
    // ---------------------------------------------------------------------

    /** Returns a ready-to-read buffer holding {@code bb}, direct or heap. */
    private static ByteBuffer inputBytes(byte[] bb, boolean direct) {
        if (!direct) {
            return ByteBuffer.wrap(bb);
        }
        ByteBuffer bbf = ByteBuffer.allocateDirect(bb.length);
        bbf.put(bb);
        bbf.flip();
        return bbf;
    }

    /** Returns an empty char buffer with room for {@code capacity} chars. */
    private static CharBuffer outputChars(int capacity, boolean direct) {
        if (direct) {
            // Two bytes of direct storage per char.
            return ByteBuffer.allocateDirect(capacity * 2).asCharBuffer();
        }
        return CharBuffer.allocate(capacity);
    }

    /** Returns a ready-to-read buffer holding {@code cc}, direct or heap. */
    private static CharBuffer inputChars(char[] cc, boolean direct) {
        if (!direct) {
            return CharBuffer.wrap(cc);
        }
        CharBuffer cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
        cbf.put(cc);
        cbf.flip();
        return cbf;
    }

    /** Returns an empty byte buffer with room for {@code capacity} bytes. */
    private static ByteBuffer outputBytes(int capacity, boolean direct) {
        return direct ? ByteBuffer.allocateDirect(capacity)
                      : ByteBuffer.allocate(capacity);
    }

    /**
     * Formats {@code bytes} as space-separated, two-digit lowercase hex values
     * (for example {@code "e0 0a bf"}); used in failure messages.
     *
     * @param bytes the bytes to format
     * @return the hex dump, empty for an empty array
     */
    static String toHex(byte[] bytes) {
        StringBuilder sb = new StringBuilder(bytes.length * 3);
        for (int i = 0; i < bytes.length; i++) {
            if (i > 0) {
                sb.append(' ');
            }
            sb.append(String.format("%02x", bytes[i] & 0xff));
        }
        return sb.toString();
    }

    // ---------------------------------------------------------------------
    // Encode / decode entry points
    // ---------------------------------------------------------------------

    /**
     * Decodes {@code bb} in one call with a fresh decoder of charset
     * {@code csn}.
     *
     * @param bb         the encoded bytes
     * @param csn        the charset name
     * @param testDirect whether to use direct buffers instead of heap buffers
     * @return the decoded chars
     * @throws RuntimeException if the decoder reports anything other than
     *         {@link CoderResult#UNDERFLOW}
     */
    static char[] decode(byte[] bb, String csn, boolean testDirect)
        throws Exception {
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
        ByteBuffer bbf = inputBytes(bb, testDirect);
        CharBuffer cbf = outputChars(bb.length, testDirect);
        CoderResult cr = dec.decode(bbf, cbf, true);
        if (cr != CoderResult.UNDERFLOW) {
            throw new RuntimeException("Decoding err: " + csn);
        }
        char[] cc = new char[cbf.position()];
        cbf.flip();
        cbf.get(cc);
        return cc;
    }

    /**
     * Same set-up as {@link #decode(byte[], String, boolean)} but returns the
     * raw {@link CoderResult} instead of the decoded chars.
     *
     * @param bb         the encoded bytes
     * @param csn        the charset name
     * @param testDirect whether to use direct buffers instead of heap buffers
     * @return the result of the single decode call
     */
    static CoderResult decodeCR(byte[] bb, String csn, boolean testDirect)
        throws Exception {
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
        ByteBuffer bbf = inputBytes(bb, testDirect);
        CharBuffer cbf = outputChars(bb.length, testDirect);
        return dec.decode(bbf, cbf, true);
    }

    /**
     * Decodes {@code len} bytes of {@code ba} starting at {@code off},
     * replacing malformed and unmappable input.
     * Copy/paste of StringCoding.decode().
     *
     * @param cs  the charset to decode with
     * @param ba  the source bytes
     * @param off the offset of the first byte to decode
     * @param len the number of bytes to decode
     * @return the decoded chars, trimmed to the number actually produced
     * @throws Error wrapping a {@link CharacterCodingException} if decode or
     *         flush reports a non-underflow result
     */
    static char[] decode(Charset cs, byte[] ba, int off, int len) {
        CharsetDecoder cd = cs.newDecoder();
        int en = (int)(len * cd.maxCharsPerByte());
        char[] ca = new char[en];
        if (len == 0) {
            return ca;
        }
        cd.onMalformedInput(CodingErrorAction.REPLACE)
          .onUnmappableCharacter(CodingErrorAction.REPLACE)
          .reset();

        ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
        CharBuffer cb = CharBuffer.wrap(ca);
        try {
            CoderResult cr = cd.decode(bb, cb, true);
            if (!cr.isUnderflow()) {
                cr.throwException();
            }
            cr = cd.flush(cb);
            if (!cr.isUnderflow()) {
                cr.throwException();
            }
        } catch (CharacterCodingException x) {
            throw new Error(x);
        }
        return Arrays.copyOf(ca, cb.position());
    }

    /**
     * Encodes {@code cc} in one call with a fresh encoder of charset
     * {@code csn}. The output buffer holds four bytes per input char.
     *
     * @param cc         the chars to encode
     * @param csn        the charset name
     * @param testDirect whether to use direct buffers instead of heap buffers
     * @return the encoded bytes
     * @throws RuntimeException if the encoder reports anything other than
     *         {@link CoderResult#UNDERFLOW}
     */
    static byte[] encode(char[] cc, String csn, boolean testDirect)
        throws Exception {
        CharsetEncoder enc = Charset.forName(csn).newEncoder();
        ByteBuffer bbf = outputBytes(cc.length * 4, testDirect);
        CharBuffer cbf = inputChars(cc, testDirect);

        CoderResult cr = enc.encode(cbf, bbf, true);
        if (cr != CoderResult.UNDERFLOW) {
            throw new RuntimeException("Encoding err: " + csn);
        }
        byte[] bb = new byte[bbf.position()];
        bbf.flip();
        bbf.get(bb);
        return bb;
    }

    /**
     * Same set-up as {@link #encode(char[], String, boolean)} but returns the
     * raw {@link CoderResult} instead of the encoded bytes.
     *
     * @param cc         the chars to encode
     * @param csn        the charset name
     * @param testDirect whether to use direct buffers instead of heap buffers
     * @return the result of the single encode call
     */
    static CoderResult encodeCR(char[] cc, String csn, boolean testDirect)
        throws Exception {
        CharsetEncoder enc = Charset.forName(csn).newEncoder();
        ByteBuffer bbf = outputBytes(cc.length * 4, testDirect);
        CharBuffer cbf = inputChars(cc, testDirect);
        return enc.encode(cbf, bbf, true);
    }

    // ---------------------------------------------------------------------
    // Test data
    // ---------------------------------------------------------------------

    /**
     * Builds every code point from U+0000 to U+10FFFF except the surrogate
     * range U+D800..U+DFFF, in ascending order. Supplementary code points are
     * stored as surrogate pairs.
     *
     * @return the chars of all those code points
     */
    static char[] getUTFChars() {
        char[] cc = new char[0x10000 - 0xe000 + 0xd800 + //bmp
                             (0x110000 - 0x10000) * 2];    //supp
        int pos = 0;
        for (int i = 0; i < 0xd800; i++) {
            cc[pos++] = (char)i;
        }
        for (int i = 0xe000; i < 0x10000; i++) {
            cc[pos++] = (char)i;
        }
        for (int i = 0x10000; i < 0x110000; i++) {
            pos += Character.toChars(i, cc, pos);
        }
        return cc;
    }

    /**
     * Writes {@code c} into {@code bb} at {@code pos} using the 3-byte UTF-8
     * bit layout, regardless of the value of {@code c}.
     *
     * @param c   the char to write
     * @param bb  the destination array
     * @param pos the index of the first byte to write
     * @return the number of bytes written, always 3
     */
    static int to3ByteUTF8(char c, byte[] bb, int pos) {
        bb[pos++] = (byte)(0xe0 | ((c >> 12)));
        bb[pos++] = (byte)(0x80 | ((c >> 06) & 0x3f));
        bb[pos++] = (byte)(0x80 | ((c >> 00) & 0x3f));
        return 3;
    }

    // ---------------------------------------------------------------------
    // Checks
    // ---------------------------------------------------------------------

    /**
     * Encodes and decodes all chars from {@link #getUTFChars()} via heap
     * buffers, direct buffers and the {@code String} conversions, and checks
     * that every path agrees.
     *
     * @param csn the charset name
     * @throws RuntimeException if any comparison fails
     */
    static void checkRoundtrip(String csn) throws Exception {
        boolean failed = false;
        System.out.printf("    Check roundtrip <%s>...", csn);
        char[] cc = getUTFChars();
        byte[] bb = encode(cc, csn, false);
        char[] ccO = decode(bb, csn, false);

        if (!Arrays.equals(cc, ccO)) {
            System.out.printf("    non-direct failed");
            failed = true;
        }
        bb = encode(cc, csn, true);
        ccO = decode(bb, csn, true);
        if (!Arrays.equals(cc, ccO)) {
            System.out.print("    (direct) failed");
            failed = true;
        }
        // String.getBytes()/toCharArray() goes to ArrayDe/Encoder path
        if (!Arrays.equals(bb, new String(cc).getBytes(csn))) {
            System.out.printf("    String.getBytes() failed");
            failed = true;
        }
        if (!Arrays.equals(cc, new String(bb, csn).toCharArray())) {
            System.out.printf("    String.toCharArray() failed");
            failed = true;
        }
        System.out.println();
        if (failed) {
            throw new RuntimeException("Check roundtrip failed " + csn);
        }
    }

    /**
     * Encodes every supplementary code point as two 3-byte sequences (one per
     * surrogate) and checks that decoding yields the surrogate pairs back.
     *
     * @param csn the charset name
     * @throws RuntimeException if any comparison fails
     */
    static void check6ByteSurrs(String csn) throws Exception {
        boolean failed = false;
        System.out.printf("    Check 6-byte Surrogates <%s>...%n", csn);
        byte[] bb = new byte[(0x110000 - 0x10000) * 6];
        char[] cc = new char[(0x110000 - 0x10000) * 2];
        int bpos = 0;
        int cpos = 0;
        for (int i = 0x10000; i < 0x110000; i++) {
            Character.toChars(i, cc, cpos);
            bpos += to3ByteUTF8(cc[cpos], bb, bpos);
            bpos += to3ByteUTF8(cc[cpos + 1], bb, bpos);
            cpos += 2;
        }

        char[] ccO = decode(bb, csn, false);
        if (!Arrays.equals(cc, ccO)) {
            System.out.printf("    decoding failed%n");
            failed = true;
        }
        ccO = decode(bb, csn, true);
        if (!Arrays.equals(cc, ccO)) {
            System.out.printf("    decoding(direct) failed%n");
            failed = true;
        }
        // new String(bb, csn).getBytes(csn) will not return
        // the 6 bytes surrogates as in bb, so only test
        // toCharArray() here.
        if (!Arrays.equals(cc, new String(bb, csn).toCharArray())) {
            System.out.printf("    String.toCharArray() failed%n");
            failed = true;
        }
        if (failed) {
            throw new RuntimeException("Check 6-byte Surrogates failed " + csn);
        }
    }

    /**
     * Checks that two charsets encode {@link #getUTFChars()} to the same bytes
     * and decode the bytes produced by {@code csn1} to the same chars, on both
     * heap and direct buffers.
     *
     * @param csn1 the first charset name
     * @param csn2 the second charset name
     * @throws RuntimeException if any comparison fails
     */
    static void compare(String csn1, String csn2) throws Exception {
        boolean failed = false;
        System.out.printf("    Diff <%s> <%s>...%n", csn1, csn2);
        char[] cc = getUTFChars();

        byte[] bb1 = encode(cc, csn1, false);
        byte[] bb2 = encode(cc, csn2, false);
        if (!Arrays.equals(bb1, bb2)) {
            System.out.printf("        encoding failed%n");
            failed = true;
        }
        char[] cc1 = decode(bb1, csn1, false);
        char[] cc2 = decode(bb1, csn2, false);
        if (!Arrays.equals(cc1, cc2)) {
            System.out.printf("        decoding failed%n");
            failed = true;
        }

        bb1 = encode(cc, csn1, true);
        bb2 = encode(cc, csn2, true);
        if (!Arrays.equals(bb1, bb2)) {
            System.out.printf("        encoding (direct) failed%n");
            failed = true;
        }
        cc1 = decode(bb1, csn1, true);
        cc2 = decode(bb1, csn2, true);
        if (!Arrays.equals(cc1, cc2)) {
            System.out.printf("        decoding (direct) failed%n");
            failed = true;
        }
        if (failed) {
            throw new RuntimeException(
                "Diff failed " + csn1 + " " + csn2);
        }
    }

    // Malformed input samples. The first byte of each row is the expected
    // length of the malformed-input result; the remaining bytes are the input.
    static byte[][] malformed = {
        // One-byte sequences:
        {1, (byte)0xFF },
        {1, (byte)0xC0 },
        {1, (byte)0x80 },

        {1, (byte)0xFF, (byte)0xFF}, // all ones
        {1, (byte)0xA0, (byte)0x80}, // 101x first byte first nibble

        // Two-byte sequences:
        {1, (byte)0xC0, (byte)0x80}, // invalid first byte
        {1, (byte)0xC1, (byte)0xBF}, // invalid first byte
        {1, (byte)0xC2, (byte)0x00}, // invalid second byte
        {1, (byte)0xC2, (byte)0xC0}, // invalid second byte
        {1, (byte)0xD0, (byte)0x00}, // invalid second byte
        {1, (byte)0xD0, (byte)0xC0}, // invalid second byte
        {1, (byte)0xDF, (byte)0x00}, // invalid second byte
        {1, (byte)0xDF, (byte)0xC0}, // invalid second byte

        // Three-byte sequences
        {1, (byte)0xE0, (byte)0x80, (byte)0x80},  // 111x first byte first nibble
        {1, (byte)0xE0, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded (same bytes as the row above)
        {1, (byte)0xE0, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {1, (byte)0xE0, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded

        {1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte
        {2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte
        {2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte
        {1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
        {1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte
        {1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte

        // Four-byte sequences
        {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {1, (byte)0xF0, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {1, (byte)0xF0, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded

        {1, (byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
        {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80},  // invalid second byte (same bytes as the U+0000 row)
        {1, (byte)0xF0, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
        {2, (byte)0xF0, (byte)0x90, (byte)0xC0, (byte)0x80 }, // invalid third byte
        {3, (byte)0xF0, (byte)0x90, (byte)0x80, (byte)0xC0 }, // invalid fourth byte

        {1, (byte)0xF1, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
        {2, (byte)0xF1, (byte)0x80, (byte)0xC0, (byte)0x80 }, // invalid third byte
        {3, (byte)0xF1, (byte)0x80, (byte)0x80, (byte)0xC0 }, // invalid fourth byte
        {1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
        {1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
        {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte

        // Five-byte sequences
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80},  // invalid first byte
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded (same bytes as the row above)
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {5, (byte)0xF8, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded

        {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80},
        {2, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80 },
        {3, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF },
        {4, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0 },

        // Six-byte sequences
        {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
        // NOTE(review): the rows below start with 0xF8 although they sit in
        // the six-byte group; confirm whether 0xFC was intended.
        {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 },
        {2, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80 },
        {3, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF, (byte)0x80 },
        {4, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0, (byte)0x80 },
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0x80, (byte)0xC0 },
    };

    /**
     * Decodes every row of {@link #malformed} and checks that the decoder
     * reports malformed input of the expected length on both heap and direct
     * buffers, and that {@code new String(bytes, csn)} agrees with the
     * replacing {@link #decode(Charset, byte[], int, int)}.
     *
     * @param csn the charset name
     * @throws RuntimeException if any row fails
     */
    static void checkMalformed(String csn) throws Exception {
        boolean failed = false;
        System.out.printf("    Check malformed <%s>...%n", csn);
        Charset cs = Charset.forName(csn);

        for (boolean direct : new boolean[] {false, true}) {
            for (byte[] bins : malformed) {
                int mlen = bins[0];
                byte[] bin = Arrays.copyOfRange(bins, 1, bins.length);
                CoderResult cr = decodeCR(bin, csn, direct);
                if (!cr.isMalformed()) {
                    System.out.printf("        FAIL(direct=%b): [%s] not malformed.%n",
                                      direct, toHex(bin));
                    failed = true;
                } else if (cr.length() != mlen) {
                    System.out.printf("        FAIL(direct=%b): [%s] malformed[len=%d].%n",
                                      direct, toHex(bin), cr.length());
                    failed = true;
                }
            }
        }

        // The String path does not depend on the buffer kind, so it is
        // checked once per row rather than once per (row, direct) pair.
        for (byte[] bins : malformed) {
            byte[] bin = Arrays.copyOfRange(bins, 1, bins.length);
            if (!Arrays.equals(decode(cs, bin, 0, bin.length),
                               new String(bin, csn).toCharArray())) {
                System.out.printf("        FAIL(new String(bb, %s)) failed%n", csn);
                failed = true;
            }
        }

        if (failed) {
            throw new RuntimeException("Check malformed failed " + csn);
        }
    }

    /**
     * Runs one decode call (with {@code endOfInput == false}) over a window of
     * {@code utf8s} and compares the result and the final buffer positions
     * with the expectations in {@code flow}.
     *
     * @param dec    the decoder to use; it is reset before decoding
     * @param utf8s  the encoded input
     * @param direct whether to use direct buffers instead of heap buffers
     * @param flow   {@code {inPos, inLen, outPos, outLen, expectedInPos,
     *               expectedOutPos, expectedResult}} where the expected result
     *               is 0 for underflow and anything else for overflow
     * @return {@code true} if the outcome matches, {@code false} (after
     *         printing the mismatch) otherwise
     */
    static boolean check(CharsetDecoder dec, byte[] utf8s, boolean direct, int[] flow) {
        int inPos = flow[0];
        int inLen = flow[1];
        int outPos = flow[2];
        int outLen = flow[3];
        int expectedInPos = flow[4];
        int expectedOutPos = flow[5];
        CoderResult expectedCR = (flow[6] == 0) ? CoderResult.UNDERFLOW
                                                : CoderResult.OVERFLOW;
        ByteBuffer bbf;
        CharBuffer cbf;
        if (direct) {
            bbf = ByteBuffer.allocateDirect(inPos + utf8s.length);
            cbf = ByteBuffer.allocateDirect((outPos + outLen) * 2).asCharBuffer();
        } else {
            bbf = ByteBuffer.allocate(inPos + utf8s.length);
            cbf = CharBuffer.allocate(outPos + outLen);
        }
        // Place the input at offset inPos and expose only inLen bytes of it.
        bbf.position(inPos);
        bbf.put(utf8s).flip().position(inPos).limit(inPos + inLen);
        cbf.position(outPos);
        dec.reset();
        CoderResult cr = dec.decode(bbf, cbf, false);
        if (cr != expectedCR ||
            bbf.position() != expectedInPos ||
            cbf.position() != expectedOutPos) {
            System.out.printf("Expected(direct=%5b): [", direct);
            for (int i : flow) {
                System.out.print(" " + i);
            }
            System.out.println("]  CR=" + cr +
                               ", inPos=" + bbf.position() +
                               ", outPos=" + cbf.position());
            return false;
        }
        return true;
    }

    /**
     * Decodes a fixed 10-byte input (1-, 2-, 3- and 4-byte sequences) with
     * varying input windows, output capacities and buffer offsets, and checks
     * the underflow/overflow result and positions of each call.
     *
     * @param csn the charset name of the decoder under test
     * @throws RuntimeException if any flow fails
     */
    static void checkUnderOverflow(String csn) throws Exception {
        System.out.printf("    Check under/overflow <%s>...%n", csn);
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
        boolean failed = false;
        byte[] utf8s = "\u007f\u07ff\ue000\ud800\udc00".getBytes("UTF-8");
        int inlen = utf8s.length;

        for (int inoff = 0; inoff < 20; inoff++) {
            for (int outoff = 0; outoff < 20; outoff++) {
                int[][] flows = {
                    //inpos, inLen, outPos,  outLen, inPosEP,   outposEP,   under(0)/over(1)
                    {inoff,  inlen, outoff,  1,      inoff + 1, outoff + 1, 1},
                    {inoff,  inlen, outoff,  2,      inoff + 3, outoff + 2, 1},
                    {inoff,  inlen, outoff,  3,      inoff + 6, outoff + 3, 1},
                    {inoff,  inlen, outoff,  4,      inoff + 6, outoff + 3, 1},
                    {inoff,  inlen, outoff,  5,      inoff + 10,outoff + 5, 0},
                     // underflow
                    {inoff,  1,     outoff,  5,      inoff + 1, outoff + 1, 0},
                    {inoff,  2,     outoff,  5,      inoff + 1, outoff + 1, 0},
                    {inoff,  3,     outoff,  5,      inoff + 3, outoff + 2, 0},
                    {inoff,  4,     outoff,  5,      inoff + 3, outoff + 2, 0},
                    {inoff,  5,     outoff,  5,      inoff + 3, outoff + 2, 0},
                    {inoff,  6,     outoff,  5,      inoff + 6, outoff + 3, 0},
                    {inoff,  7,     outoff,  5,      inoff + 6, outoff + 3, 0},
                    {inoff,  8,     outoff,  5,      inoff + 6, outoff + 3, 0},
                    {inoff,  9,     outoff,  5,      inoff + 6, outoff + 3, 0},
                    {inoff,  10,    outoff,  5,      inoff + 10,outoff + 5, 0},
                     // 2-byte underflow/overflow
                    {inoff,  2,     outoff,  1,      inoff + 1, outoff + 1, 0},
                    {inoff,  3,     outoff,  1,      inoff + 1, outoff + 1, 1},
                     // 3-byte underflow/overflow
                    {inoff,  4,     outoff,  2,      inoff + 3, outoff + 2, 0},
                    {inoff,  5,     outoff,  2,      inoff + 3, outoff + 2, 0},
                    {inoff,  6,     outoff,  2,      inoff + 3, outoff + 2, 1},
                     // 4-byte underflow/overflow
                    {inoff,  7,     outoff,  4,      inoff + 6, outoff + 3, 0},
                    {inoff,  8,     outoff,  4,      inoff + 6, outoff + 3, 0},
                    {inoff,  9,     outoff,  4,      inoff + 6, outoff + 3, 0},
                    {inoff,  10,    outoff,  4,      inoff + 6, outoff + 3, 1},
                };
                for (boolean direct : new boolean[] {false, true}) {
                    for (int[] flow : flows) {
                        if (!check(dec, utf8s, direct, flow)) {
                            failed = true;
                        }
                    }
                }
            }
        }
        if (failed) {
            throw new RuntimeException("Check under/overflow failed " + csn);
        }
    }

    /** Runs all UTF-8 checks; any failure propagates as an exception. */
    public static void main(String[] args) throws Exception {
        checkRoundtrip("UTF-8");
        check6ByteSurrs("UTF-8");
        //compare("UTF-8", "UTF-8-OLD");
        checkMalformed("UTF-8");
        checkUnderOverflow("UTF-8");
    }
}